Name: Vishv Patel
Student Number: 8977662
We plot 4 histograms, one per feature; each is split by class, so the point is to compare the two class distributions across the resulting 8 per-class histograms.
The goal is to classify the species of an Iris flower given the dimensions of parts of the flower. The classification is binary, there are two classes: *virginica and non-virginica.*
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
# Step 1: Obtain the data using the sklearn load_iris method
iris = load_iris(as_frame=True)
# With as_frame=True, iris.data is already a DataFrame whose columns are the
# feature names; copy it so the cached sklearn bunch is never mutated.
iris_df = iris.data.copy()
iris_df['target'] = iris.target
display(iris_df.head())
# Step 2: Redefine 2 classes for the target variable: virginica and non-virginica
# Original encoding: 0 = setosa, 1 = versicolor, 2 = virginica.
iris_df['target'] = iris_df['target'].map({0: 'non-virginica', 1: 'non-virginica', 2: 'virginica'})
| sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | target | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
# Split the dataset into two separate dataframes, one per target class
is_virginica = iris_df['target'] == 'virginica'
virginica_df = iris_df[is_virginica]
non_virginica_df = iris_df[~is_virginica]
# Calculate descriptive statistics (count/mean/std/quartiles) for each class
virginica_stats = virginica_df.describe()
non_virginica_stats = non_virginica_df.describe()
# Display the descriptive statistics for each class; the second header is
# prefixed with a newline exactly as before
for prefix, label, stats in (("", "virginica", virginica_stats),
                             ("\n", "non-virginica", non_virginica_stats)):
    print(f"{prefix}Descriptive Statistics for '{label}' class:")
    display(stats)
Descriptive Statistics for 'virginica' class:
| sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
|---|---|---|---|---|
| count | 50.00000 | 50.000000 | 50.000000 | 50.00000 |
| mean | 6.58800 | 2.974000 | 5.552000 | 2.02600 |
| std | 0.63588 | 0.322497 | 0.551895 | 0.27465 |
| min | 4.90000 | 2.200000 | 4.500000 | 1.40000 |
| 25% | 6.22500 | 2.800000 | 5.100000 | 1.80000 |
| 50% | 6.50000 | 3.000000 | 5.550000 | 2.00000 |
| 75% | 6.90000 | 3.175000 | 5.875000 | 2.30000 |
| max | 7.90000 | 3.800000 | 6.900000 | 2.50000 |
Descriptive Statistics for 'non-virginica' class:
| sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | |
|---|---|---|---|---|
| count | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
| mean | 5.471000 | 3.099000 | 2.861000 | 0.786000 |
| std | 0.641698 | 0.478739 | 1.449549 | 0.565153 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 5.000000 | 2.800000 | 1.500000 | 0.200000 |
| 50% | 5.400000 | 3.050000 | 2.450000 | 0.800000 |
| 75% | 5.900000 | 3.400000 | 4.325000 | 1.300000 |
| max | 7.000000 | 4.400000 | 5.100000 | 1.800000 |
# Use seaborn's whitegrid style for the histograms
sns.set(style="whitegrid")
# One histogram per feature; `hue='target'` overlays the two classes so the
# per-feature separation between virginica and non-virginica is visible
for col in iris.feature_names:
    plt.figure(figsize=(8, 6))
    sns.histplot(data=iris_df, x=col, hue='target', kde=True, palette="husl", bins=20)
    plt.title(f'Histogram of {col} by Class')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()
# Build a fully numeric frame for the correlation: the 'target' column of
# iris_df currently holds string labels, so restore the original 0/1/2 codes
iris_df_with_target = iris_df.copy()
iris_df_with_target['target'] = iris.target
# Compute the pairwise (Pearson) correlation matrix, target included
co_matrix_with_target = iris_df_with_target.corr()
# Render the matrix as an annotated heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(co_matrix_with_target, annot=True, cmap='plasma', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix of Iris Dataset Features with Target')
plt.show()
# Set the style of seaborn
sns.set(style="ticks")
# Plot pairplot. pairplot() is a figure-level function that draws its own
# legend from `hue`; the previous plt.legend(...) call found no labelled
# artists on the current axes and only emitted a warning, so it is removed.
sns.pairplot(iris_df, hue="target", palette="husl")
plt.suptitle("Pairplot of Iris Dataset Features by Class", y=1.02)
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
# Seaborn whitegrid style for the boxplot
sns.set(style="whitegrid")
# Boxplot: distribution of petal length for each of the two classes
plt.figure(figsize=(8, 6))
sns.boxplot(data=iris_df, y="petal length (cm)", x="target", palette="husl")
plt.title("Boxplot of Petal Length by Class")
plt.xlabel("Class")
plt.ylabel("Petal Length (cm)")
plt.show()
# Set the style of seaborn
sns.set(style="whitegrid")
# Plot violin plot of sepal width, split by class
plt.figure(figsize=(8, 6))
sns.violinplot(data=iris_df, x="target", y="sepal width (cm)", palette="husl")
# Fixed typo in the displayed title: "Setal" -> "Sepal"
plt.title("Violin Plot of Sepal Width by Class")
plt.xlabel("Class")
plt.ylabel("Sepal Width (cm)")
plt.show()
The correlation coefficients between the target column and the four features are as follows: sepal length has a correlation of 0.78, petal length has a correlation of 0.95, and petal width has a correlation of 0.96, indicating a strong positive relationship. On the other hand, sepal width has a correlation of -0.43, suggesting a moderate negative relationship, which is the lowest correlation among the features.
Histogram Insights
Sepal Length: The histograms show that 'virginica' tends to have slightly longer sepal lengths compared to 'non-virginica', as seen by the rightward shift in the distribution for 'virginica'.
Sepal Width: There's overlap between the distributions of 'virginica' and 'non-virginica', indicating that sepal width alone might not be a strong discriminator between the two classes.
Petal Length: 'Virginica' consistently exhibits longer petal lengths compared to 'non-virginica', as evidenced by the rightward shift and higher frequency in the longer petal length range.
Petal Width: Similar to petal length, 'virginica' tends to have wider petals compared to 'non-virginica', showing a clear distinction between the two classes.
Sepal Length vs. Sepal Width: There's significant overlap between the classes, suggesting that these two features alone might not be sufficient to distinctly separate the classes.
Petal Length vs. Petal Width: 'Virginica' tends to have both longer and wider petals compared to 'non-virginica', indicating that these features combined might be more effective in distinguishing between the classes.
Diagonal Plots: Along the diagonal, histograms for each feature are displayed. They show the distribution of each feature within each class. Consistent patterns observed in these histograms reinforce the insights gained from individual histograms.
# First split: hold out 30 samples for validation + test. Note that an
# integer test_size is an absolute sample count (not a fraction), so the
# remaining 120 samples form the training set.
features = iris_df.drop('target', axis=1)
labels = iris_df['target']
X_train, X_temp, y_train, y_temp = train_test_split(features, labels, test_size=30, random_state=42)
# Second split: halve the 30 held-out samples into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
# Verify the shapes of the resulting sets
for split_name, subset in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set shape:", subset.shape)
Train set shape: (120, 4) Validation set shape: (15, 4) Test set shape: (15, 4)
# Model 1: logistic regression on a single feature (sepal length).
# fit() returns the estimator itself, so the chained call binds the
# same fitted model the two-statement form did.
log_reg1 = LogisticRegression().fit(X_train[['sepal length (cm)']], y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
# Model 2: logistic regression on the two sepal features
feats2 = ['sepal length (cm)', 'sepal width (cm)']
log_reg2 = LogisticRegression().fit(X_train[feats2], y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
# Model 3: logistic regression on both sepal features plus petal length
feats3 = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)']
log_reg3 = LogisticRegression().fit(X_train[feats3], y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
# Model 4: logistic regression on all four features
feats4 = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
log_reg4 = LogisticRegression().fit(X_train[feats4], y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
Model 1
# Model 1: score the validation set with the single-feature model
val_X1 = X_val[['sepal length (cm)']]
predict1 = log_reg1.predict(val_X1)
predict_prob1 = log_reg1.predict_proba(val_X1)
# Flatten y_val to a Series if it arrived as a single-column DataFrame
if isinstance(y_val, pd.DataFrame):
    y_val = y_val.squeeze()
# Side-by-side view: P(virginica) (column index 1), hard prediction, truth
df1 = pd.DataFrame({
    'probability of predicting verginica': predict_prob1[:, 1],
    'actual prediction by the model': predict1,
    'Ground Truth': y_val,
})
df1
| probability of predicting verginica | actual prediction by the model | Ground Truth | |
|---|---|---|---|
| 26 | 0.064510 | non-virginica | non-virginica |
| 18 | 0.217912 | non-virginica | non-virginica |
| 118 | 0.937717 | virginica | virginica |
| 145 | 0.671933 | virginica | virginica |
| 78 | 0.336388 | non-virginica | non-virginica |
| 127 | 0.382264 | non-virginica | virginica |
| 108 | 0.671933 | virginica | virginica |
| 55 | 0.217912 | non-virginica | non-virginica |
| 30 | 0.044226 | non-virginica | non-virginica |
| 29 | 0.036520 | non-virginica | non-virginica |
| 141 | 0.753228 | virginica | virginica |
| 110 | 0.578831 | virginica | virginica |
| 19 | 0.077646 | non-virginica | non-virginica |
| 132 | 0.529589 | virginica | virginica |
| 64 | 0.185827 | non-virginica | non-virginica |
Model 2
# Model 2: score the validation set with the two-feature model
val_X2 = X_val[['sepal length (cm)', 'sepal width (cm)']]
predict2 = log_reg2.predict(val_X2)
predict_prob2 = log_reg2.predict_proba(val_X2)
# Flatten y_val to a Series if it arrived as a single-column DataFrame
if isinstance(y_val, pd.DataFrame):
    y_val = y_val.squeeze()
# Side-by-side view: P(virginica), hard prediction, truth
df2 = pd.DataFrame({
    'probability of predicting verginica': predict_prob2[:, 1],
    'actual prediction by the model': predict2,
    'ground truth': y_val,
})
df2
| probability of predicting verginica | actual prediction by the model | ground truth | |
|---|---|---|---|
| 26 | 0.050796 | non-virginica | non-virginica |
| 18 | 0.145047 | non-virginica | non-virginica |
| 118 | 0.949865 | virginica | virginica |
| 145 | 0.669280 | virginica | virginica |
| 78 | 0.347378 | non-virginica | non-virginica |
| 127 | 0.379491 | non-virginica | virginica |
| 108 | 0.732581 | virginica | virginica |
| 55 | 0.237155 | non-virginica | non-virginica |
| 30 | 0.041289 | non-virginica | non-virginica |
| 29 | 0.032140 | non-virginica | non-virginica |
| 141 | 0.739476 | virginica | virginica |
| 110 | 0.546099 | virginica | virginica |
| 19 | 0.048771 | non-virginica | non-virginica |
| 132 | 0.556691 | virginica | virginica |
| 64 | 0.193357 | non-virginica | non-virginica |
Model 3
# Model 3: score the validation set with the three-feature model
val_X3 = X_val[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)']]
predict3 = log_reg3.predict(val_X3)
predict_prob3 = log_reg3.predict_proba(val_X3)
# Flatten y_val to a Series if it arrived as a single-column DataFrame
if isinstance(y_val, pd.DataFrame):
    y_val = y_val.squeeze()
# Side-by-side view: P(virginica), hard prediction, truth
df3 = pd.DataFrame({
    'probability of predicting verginica': predict_prob3[:, 1],
    'actual prediction by the model': predict3,
    'ground truth': y_val,
})
df3
| probability of predicting verginica | actual prediction by the model | ground truth | |
|---|---|---|---|
| 26 | 0.000016 | non-virginica | non-virginica |
| 18 | 0.000013 | non-virginica | non-virginica |
| 118 | 0.998416 | virginica | virginica |
| 145 | 0.697669 | virginica | virginica |
| 78 | 0.227787 | non-virginica | non-virginica |
| 127 | 0.520907 | virginica | virginica |
| 108 | 0.958540 | virginica | virginica |
| 55 | 0.262784 | non-virginica | non-virginica |
| 30 | 0.000020 | non-virginica | non-virginica |
| 29 | 0.000020 | non-virginica | non-virginica |
| 141 | 0.585897 | virginica | virginica |
| 110 | 0.622702 | virginica | virginica |
| 19 | 0.000009 | non-virginica | non-virginica |
| 132 | 0.921539 | virginica | virginica |
| 64 | 0.015291 | non-virginica | non-virginica |
Model 4
# Model 4: score the validation set with the full four-feature model
val_X4 = X_val[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']]
predict4 = log_reg4.predict(val_X4)
predict_prob4 = log_reg4.predict_proba(val_X4)
# Flatten y_val to a Series if it arrived as a single-column DataFrame
if isinstance(y_val, pd.DataFrame):
    y_val = y_val.squeeze()
# Side-by-side view: P(virginica), hard prediction, truth
df4 = pd.DataFrame({
    'probability of predicting verginica': predict_prob4[:, 1],
    'actual prediction by the model': predict4,
    'ground truth': y_val,
})
df4
| probability of predicting verginica | actual prediction by the model | ground truth | |
|---|---|---|---|
| 26 | 0.000009 | non-virginica | non-virginica |
| 18 | 0.000006 | non-virginica | non-virginica |
| 118 | 0.998534 | virginica | virginica |
| 145 | 0.873922 | virginica | virginica |
| 78 | 0.207005 | non-virginica | non-virginica |
| 127 | 0.572730 | virginica | virginica |
| 108 | 0.946564 | virginica | virginica |
| 55 | 0.170670 | non-virginica | non-virginica |
| 30 | 0.000008 | non-virginica | non-virginica |
| 29 | 0.000008 | non-virginica | non-virginica |
| 141 | 0.820222 | virginica | virginica |
| 110 | 0.728198 | virginica | virginica |
| 19 | 0.000004 | non-virginica | non-virginica |
| 132 | 0.956114 | virginica | virginica |
| 64 | 0.016196 | non-virginica | non-virginica |
Accuracy Score for Each Model
from sklearn.metrics import accuracy_score
# Validation accuracy for each of the four nested feature sets
accuracy1, accuracy2, accuracy3, accuracy4 = (
    accuracy_score(y_val, preds)
    for preds in (predict1, predict2, predict3, predict4)
)
for idx, acc in enumerate((accuracy1, accuracy2, accuracy3, accuracy4), start=1):
    print(f"Accuracy of the logistic regression model{idx} is: ", acc)
Accuracy of the logistic regression model1 is: 0.9333333333333333 Accuracy of the logistic regression model2 is: 0.9333333333333333 Accuracy of the logistic regression model3 is: 1.0 Accuracy of the logistic regression model4 is: 1.0
Model 1
# 1-D decision boundary for model 1: P(virginica) = 0.5 where
# w * x + b = 0, i.e. x = -b / w
sepal_lengths = X_val['sepal length (cm)']
decision_boundary = -log_reg1.intercept_ / log_reg1.coef_[0]
plt.scatter(sepal_lengths, y_val)
plt.axvline(x=decision_boundary)
plt.xlabel('Sepal Length (cm)')
plt.show()
Model 2
# 2-D decision boundary for model 2: solve w1*x1 + w2*x2 + b = 0 for x2,
# giving x2 = -b/w2 - (w1/w2)*x1
w1, w2 = log_reg2.coef_[0]
b = log_reg2.intercept_
decision_boundary_x1 = np.linspace(X_val['sepal length (cm)'].min(), X_val['sepal length (cm)'].max(), 10)
decision_boundary_x2 = -b / w2 - (w1 / w2) * decision_boundary_x1
# Define colors for each class
colors = ['orange', 'lightgreen']  # You may need to add more colors if you have more than three classes
# One scatter series per class so each gets its own color and legend entry
for class_label, color in zip(np.unique(y_val), colors):
    mask = y_val == class_label
    plt.scatter(X_val.loc[mask, 'sepal length (cm)'],
                X_val.loc[mask, 'sepal width (cm)'],
                color=color, label=f'Class {class_label}')
# Plot the decision boundary line on top of the class scatter
plt.plot(decision_boundary_x1, decision_boundary_x2, color='black', label='Decision Boundary')
# Labels, title, legend
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.title('Decision Boundary')
plt.legend()
plt.show()
Model 3
import plotly.graph_objs as go
import plotly
plotly.offline.init_notebook_mode()
# Axis ranges for the first two features, padded by 1 on each side
min_feature_value1 = X_val['sepal length (cm)'].min() - 1
max_feature_value1 = X_val['sepal length (cm)'].max() + 1
min_feature_value2 = X_val['sepal width (cm)'].min() - 1
max_feature_value2 = X_val['sepal width (cm)'].max() + 1
x1, x2 = np.meshgrid(np.linspace(min_feature_value1, max_feature_value1, 100),
                     np.linspace(min_feature_value2, max_feature_value2, 100))
# Decision surface: solve w1*x1 + w2*x2 + w3*x3 + b = 0 for x3
x3 = (-log_reg3.intercept_ - log_reg3.coef_[0][0] * x1 - log_reg3.coef_[0][1] * x2) / log_reg3.coef_[0][2]
decision_boundary_surface = go.Surface(x=x1, y=x2, z=x3, colorscale='Viridis', opacity=0.5)
# FIX: the original passed a fixed 3-colour list for all 15 validation points,
# so marker colours did not correspond to classes. Colour each point by its
# class label instead (colorscale dropped — it is ignored for string colours).
class_colors = {'non-virginica': 'orange', 'virginica': 'lightgreen'}
point_colors = [class_colors[label] for label in y_val]
data_scatter = go.Scatter3d(x=X_val['sepal length (cm)'], y=X_val['sepal width (cm)'], z=X_val['petal length (cm)'],
                            mode='markers',
                            marker=dict(size=5, color=point_colors, opacity=0.8))
layout = go.Layout(title='3D plot with decision boundary',
                   scene=dict(xaxis_title='sepal length (cm)',
                              yaxis_title='sepal width (cm)',
                              zaxis_title='petal length (cm)'),
                   margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=[decision_boundary_surface, data_scatter], layout=layout)
fig.show()
# Final evaluation: model 3 (three features) on the held-out test set
model3_features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)']
test_predictions = log_reg3.predict(X_test[model3_features])
# Calculate and report accuracy on the test set
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Accuracy on the test set: {test_accuracy:.4f}")
Accuracy on the test set: 1.0000